Chapter 3 Data quality
3.1 Load statistics
read_stats <- read_tsv("data/reports/multiqc_fastqc.txt",
col_types = cols_only("Sample" = col_character(),
"Total Sequences" = col_double(),
"%GC" = col_double(),
"total_deduplicated_percentage" = col_double())) %>%
mutate(Sample = str_extract(Sample, "M\\d+")) %>%
rename(microsample=Sample,total_sequences="Total Sequences",percent_gc="%GC",percent_unique=total_deduplicated_percentage) %>%
group_by(microsample) %>%
summarise(total_sequences=sum(total_sequences), percent_unique=mean(percent_unique), percent_gc=mean(percent_gc))host_mapping_stats <- read_tsv("data/reports/multiqc_samtools_flagstat.txt") %>%
mutate(reference = case_when(
grepl("GRCh38", Sample, ignore.case = TRUE) ~ "human",
grepl("GRCg7b", Sample, ignore.case = TRUE) ~ "chicken",
TRUE ~ NA )) %>%
filter(reference=="chicken") %>%
mutate(Sample = str_extract(Sample, "M\\d+")) %>%
rename(microsample=Sample,reads_mapped_host=mapped_passed,reads_mapped_host_percent=mapped_passed_pct) %>%
select(microsample,reads_mapped_host,reads_mapped_host_percent) %>%
group_by(microsample) %>%
summarise(reads_mapped_host=sum(reads_mapped_host),reads_mapped_host_percent=mean(reads_mapped_host_percent))human_mapping_stats <- read_tsv("data/reports/multiqc_samtools_flagstat.txt") %>%
mutate(reference = case_when(
grepl("GRCh38", Sample, ignore.case = TRUE) ~ "human",
grepl("GRCg7b", Sample, ignore.case = TRUE) ~ "chicken",
TRUE ~ NA )) %>%
filter(reference=="human") %>%
mutate(Sample = str_extract(Sample, "M\\d+")) %>%
rename(microsample=Sample, reads_mapped_human=mapped_passed,reads_mapped_human_percent=mapped_passed_pct) %>%
select(microsample,reads_mapped_human,reads_mapped_human_percent) %>%
group_by(microsample) %>%
summarise(reads_mapped_human=sum(reads_mapped_human),reads_mapped_human_percent=mean(reads_mapped_human_percent))quantification_stats <- read_tsv("data/reports/multiqc_samtools_stats.txt") %>%
filter(str_detect(Sample, "mgg-pbdrep")) %>%
mutate(Sample = str_extract(Sample, "M\\d+")) %>%
rename(microsample=Sample) %>%
group_by(microsample) %>%
summarise(reads_mapped=sum(reads_mapped),reads_mapped_percent=mean(reads_mapped_percent))3.2 Individual overview
3.2.1 Sequencing depth
quality_stats %>%
left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
filter(section != "Ileum") %>%
mutate(animal=substr(cryosection,1,4)) %>%
ggplot(aes(x=total_sequences,y=microsample,fill=animal))+
geom_col()+
scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
geom_vline(xintercept=10000000, linetype="dashed", color = "red", size=1) +
facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
theme(strip.text.y.left = element_text(angle = 0)) +
labs(x="Number of reads", y="Microsamples", fill="Library protocol")3.2.2 Sequence duplication
quality_stats %>%
left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
filter(section != "Ileum") %>%
mutate(animal=substr(cryosection,1,4)) %>%
ggplot(aes(x=percent_unique,y=microsample,fill=animal))+
geom_col()+
scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
geom_vline(xintercept=35, linetype="dashed", color = "red", size=1) +
facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
theme(strip.text.y.left = element_text(angle = 0)) +
labs(x="Percentage of non-duplicates", y="Microsamples", fill="Collection success")3.2.3 GC %
quality_stats %>%
left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
filter(section != "Ileum") %>%
mutate(animal=substr(cryosection,1,4)) %>%
ggplot(aes(x=percent_gc,y=microsample,fill=animal))+
geom_col()+
scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
geom_vline(xintercept=60, linetype="dashed", color = "red", size=1) +
facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
theme(strip.text.y.left = element_text(angle = 0)) +
labs(x="Percentage of GC", y="Microsamples", fill="Library protocol")3.2.4 Host %
quality_stats %>%
left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
filter(section != "Ileum") %>%
mutate(animal=substr(cryosection,1,4)) %>%
ggplot(aes(x=reads_mapped_host_percent,y=microsample,fill=animal))+
geom_col()+
scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
theme(strip.text.y.left = element_text(angle = 0)) +
labs(x="Host %", y="Microsamples", fill="Library protocol")3.2.5 Human %
quality_stats %>%
left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
filter(section != "Ileum") %>%
mutate(animal=substr(cryosection,1,4)) %>%
ggplot(aes(x=reads_mapped_human_percent,y=microsample,fill=animal))+
geom_col()+
scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
geom_vline(xintercept=5, linetype="dashed", color = "red", size=1) +
facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
theme(strip.text.y.left = element_text(angle = 0)) +
labs(x="Human %", y="Microsamples", fill="Library protocol")3.2.6 Bacteria mapping %
quality_stats %>%
left_join(sample_metadata,by="microsample") %>%
mutate(animal=substr(cryosection,1,4)) %>%
filter(section != "Ileum") %>%
ggplot(aes(x=reads_mapped_percent,y=microsample,fill=animal))+
geom_col()+
scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
geom_vline(xintercept=75, linetype="dashed", color = "red", size=1) +
facet_nested(animal + type ~ ., scales="free", space="free", switch = "y") +
theme(strip.text.y.left = element_text(angle = 0)) +
labs(x="Mapped to MAGs (%)", y="Microsamples", fill="Library protocol")3.3 Biplots
3.3.1 Sequencing depth vs. GC %
quality_stats %>%
left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
filter(section != "Ileum") %>%
filter(type == "Positive") %>%
mutate(animal=substr(cryosection,1,4)) %>%
ggplot(aes(x=percent_gc,y=total_sequences,color=animal))+
geom_point()+
scale_color_manual(values=c("#a3d1cf","#d1a3cf")) +
facet_nested(. ~ batch, scales="free") +
labs(color="Sexrion")3.3.2 Unique sequences vs. GC %
quality_stats %>%
left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
filter(section != "Ileum") %>%
filter(type == "Positive") %>%
mutate(animal=substr(cryosection,1,4)) %>%
ggplot(aes(x=percent_gc,y=percent_unique,color=animal))+
geom_point()+
scale_color_manual(values=c("#a3d1cf","#d1a3cf")) +
facet_nested(. ~ batch, scales="free")+
labs(color="Library protocol")3.4 Quality flagging
quality <- quality_stats %>%
mutate(depth = case_when(
total_sequences <= 10000000 ~ 0,
total_sequences > 10000000 ~ 1,
TRUE ~ NA)) %>%
mutate(duplicates = case_when(
percent_unique <= 35 ~ 0,
percent_unique > 35 ~ 1,
TRUE ~ NA)) %>%
mutate(gc = case_when(
percent_gc >= 60 ~ 0,
percent_gc < 60 ~ 1,
TRUE ~ NA)) %>%
mutate(human = case_when(
reads_mapped_human_percent >= 5 ~ 0,
reads_mapped_human_percent < 5 ~ 1,
TRUE ~ NA)) %>%
mutate(bacteria = case_when(
reads_mapped_percent <= 75 ~ 0,
reads_mapped_percent > 75 ~ 1,
TRUE ~ NA)) %>%
select(microsample, depth, duplicates, gc, human, bacteria) %>%
mutate(quality = depth + duplicates + gc + human + bacteria) %>%
select(microsample, quality)
quality %>% write_tsv("results/quality.tsv")3.4.1 Quality overview
quality %>%
left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
filter(section != "Ileum") %>%
mutate(animal=substr(cryosection,1,4)) %>%
ggplot(aes(x=quality,y=microsample,fill=animal))+
geom_col()+
scale_fill_manual(values=c("#a3d1cf","#d1a3cf")) +
geom_vline(xintercept=5, linetype="dashed", color = "red", size=1) +
facet_nested(batch + section + type ~ ., scales="free", space="free", switch = "y") +
theme(strip.text.y.left = element_text(angle = 0)) +
labs(x="Quality score", y="Microsamples", fill="Collection success")quality %>%
left_join(sample_metadata,by=join_by(microsample==microsample)) %>%
filter(section != "Ileum") %>%
filter(type == "Positive") %>%
group_by(section) %>%
summarise(average=mean(quality, na.rm=TRUE), percentage_5 = mean(quality == 5, na.rm = TRUE) * 100) %>%
tt()| section | average | percentage_5 |
|---|---|---|
| Colon | 2.972222 | 9.027778 |